{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard library\n",
    "import json\n",
    "import os\n",
    "import re\n",
    "\n",
    "# Third-party\n",
    "import smart_open\n",
    "from dolma.core.paths import cached_path, mkdir_p, parent\n",
    "\n",
    "# Set AWS_PROFILE to \"r2\" in this kernel's environment.\n",
    "# NOTE(review): no visible cell reads this variable directly; presumably it is\n",
    "# picked up by the AWS SDK when a remote path is opened - confirm.\n",
    "os.environ[\"AWS_PROFILE\"] = \"r2\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the recipe table of the StevenBlack/hosts README into `readme_links`,\n",
    "# a dict mapping a normalized recipe name to the URL found in the row's third cell.\n",
    "page = \"https://github.com/StevenBlack/hosts/raw/master/readme.md\"\n",
    "readme_links = {}\n",
    "started = False\n",
    "# NOTE(review): cached_path presumably returns a local copy of the page - confirm.\n",
    "# Decode explicitly as UTF-8 so parsing does not depend on the platform locale.\n",
    "with smart_open.open(cached_path(page), encoding=\"utf-8\") as f:\n",
    "    for ln in f:\n",
    "        if not started:\n",
    "            # Skip everything up to and including the table's separator row.\n",
    "            if ln.startswith(\"| ---------------- |\"):\n",
    "                started = True\n",
    "            continue\n",
    "        elif not ln.strip():\n",
    "            # The first blank line after the separator row ends the table.\n",
    "            break\n",
    "\n",
    "        cells = ln.split(\"|\")\n",
    "        if len(cells) != 5:\n",
    "            # Same exception type as a failed tuple unpack, but with the offending row attached.\n",
    "            raise ValueError(f\"Expected 5 '|'-separated cells, got {len(cells)}: {ln!r}\")\n",
    "        # Only the first cell (recipe name) and the third cell (markdown link) are used.\n",
    "        name, link = cells[0], cells[2]\n",
    "\n",
    "        if \"=\" in name:\n",
    "            # A row whose name contains \"=\" is given the fixed base name.\n",
    "            name = \"adware_malware\"\n",
    "        elif \"Unified hosts\" in name:\n",
    "            name = name.replace(\"Unified hosts\", \"adware_malware\")\n",
    "\n",
    "        # Lowercase, drop \"'s\", squash each run of non-alphanumerics into a single \"_\",\n",
    "        # then trim leading/trailing underscores.\n",
    "        name = re.sub(r\"[^a-zA-Z0-9]+\", \"_\", name.strip().lower().replace(\"'s\", \"\")).strip(\"_\")\n",
    "\n",
    "        # Markdown link \"[text](url)\" -> \"url\"\n",
    "        url = link.strip().split(\"](\", 1)[1].rstrip(\")\")\n",
    "        readme_links[name] = url\n",
    "\n",
    "# Fail loudly rather than letting later cells iterate over an empty mapping.\n",
    "if not readme_links:\n",
    "    raise ValueError(f\"No recipe rows found in {page}; the README table layout may have changed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloaded adware_malware from https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware.txt.gz\n",
      "Downloaded adware_malware_fakenews from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware_fakenews.txt.gz\n",
      "Downloaded fakenews from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-only/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/fakenews.txt.gz\n",
      "Downloaded adware_malware_gambling from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/gambling/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware_gambling.txt.gz\n",
      "Downloaded gambling from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/gambling-only/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/gambling.txt.gz\n",
      "Downloaded adware_malware_porn from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/porn/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware_porn.txt.gz\n",
      "Downloaded porn from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/porn-only/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/porn.txt.gz\n",
      "Downloaded adware_malware_social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/social/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware_social.txt.gz\n",
      "Downloaded social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/social-only/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/social.txt.gz\n",
      "Downloaded adware_malware_fakenews_gambling from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-gambling/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware_fakenews_gambling.txt.gz\n",
      "Downloaded fakenews_gambling from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-gambling-only/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/fakenews_gambling.txt.gz\n",
      "Downloaded adware_malware_fakenews_porn from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-porn/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware_fakenews_porn.txt.gz\n",
      "Downloaded fakenews_porn from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-porn-only/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/fakenews_porn.txt.gz\n",
      "Downloaded adware_malware_fakenews_social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-social/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware_fakenews_social.txt.gz\n",
      "Downloaded fakenews_social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-social-only/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/fakenews_social.txt.gz\n",
      "Downloaded adware_malware_gambling_porn from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/gambling-porn/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware_gambling_porn.txt.gz\n",
      "Downloaded gambling_porn from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/gambling-porn-only/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/gambling_porn.txt.gz\n",
      "Downloaded adware_malware_gambling_social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/gambling-social/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware_gambling_social.txt.gz\n",
      "Downloaded gambling_social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/gambling-social-only/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/gambling_social.txt.gz\n",
      "Downloaded adware_malware_porn_social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/porn-social/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware_porn_social.txt.gz\n",
      "Downloaded porn_social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/porn-social-only/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/porn_social.txt.gz\n",
      "Downloaded adware_malware_fakenews_gambling_porn from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-gambling-porn/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware_fakenews_gambling_porn.txt.gz\n",
      "Downloaded fakenews_gambling_porn from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-gambling-porn-only/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/fakenews_gambling_porn.txt.gz\n",
      "Downloaded adware_malware_fakenews_gambling_social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-gambling-social/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware_fakenews_gambling_social.txt.gz\n",
      "Downloaded fakenews_gambling_social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-gambling-social-only/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/fakenews_gambling_social.txt.gz\n",
      "Downloaded adware_malware_fakenews_porn_social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-porn-social/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware_fakenews_porn_social.txt.gz\n",
      "Downloaded fakenews_porn_social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-porn-social-only/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/fakenews_porn_social.txt.gz\n",
      "Downloaded adware_malware_gambling_porn_social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/gambling-porn-social/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware_gambling_porn_social.txt.gz\n",
      "Downloaded gambling_porn_social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/gambling-porn-social-only/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/gambling_porn_social.txt.gz\n",
      "Downloaded adware_malware_fakenews_gambling_porn_social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-gambling-porn-social/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/adware_malware_fakenews_gambling_porn_social.txt.gz\n",
      "Downloaded fakenews_gambling_porn_social from https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-gambling-porn-social-only/hosts to /Users/lucas/Documents/blocklist_hosts/blocklist_hosts-20240208/fakenews_gambling_porn_social.txt.gz\n"
     ]
    }
   ],
   "source": [
    "# Download every list in `readme_links` and store it gzip-compressed under the user's home directory.\n",
    "# NOTE(review): \"20240208\" looks like the snapshot date of this download - confirm before re-running.\n",
    "snapshot_dir = f\"{os.path.expanduser('~')}/Documents/blocklist_hosts/blocklist_hosts-20240208\"\n",
    "\n",
    "for name, url in readme_links.items():\n",
    "    dest = f\"{snapshot_dir}/{name}.txt.gz\"\n",
    "    # Make sure the destination directory exists before writing.\n",
    "    mkdir_p(parent(dest))\n",
    "    # Read and write as UTF-8 explicitly so the copy does not depend on the platform locale.\n",
    "    with smart_open.open(url, \"rt\", encoding=\"utf-8\") as src:\n",
    "        data = src.read()\n",
    "    with smart_open.open(dest, \"wt\", encoding=\"utf-8\", compression=\".gz\") as dst:\n",
    "        dst.write(data)\n",
    "    print(f\"Downloaded {name} from {url} to {dest}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dolma",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
